This report is meant to answer the following question: can I predict how long a ride will take from the time and place of departure, as well as some minimal information about the rider?
I'm examining three months of citibike rider data, from July through September of 2019, totaling about 7 million rides. The only significant peculiarity in the data is a spike in the rider birth year for 1969. There are about 7 times more riders "born" in 1969 than in the surrounding years. I would assume that 1969 is the default birth year when users create accounts.
As seen in this figure, 50% of all rides are 10 minutes or less and over 90% of all rides are shorter than 30 minutes. While there are fewer riders on Sundays than during the rest of the week, day of week alone does not explain daily ridership, as seen here. There is some other external factor, most probably weather, which has a stronger influence on the probability of a ride.
When trying to predict the length of a ride, none of the non-engineered variables are significantly correlated with trip duration, as seen here.
The code below was run on the following:
import pandas as pd
import altair as alt
import numpy as np
import pandas_profiling
from IPython.display import display, IFrame
import os
from geopy import distance
from typing import Tuple
# Altair renderer selection: exactly one of the two calls below should be
# active for a given session; running both means the second overrides the
# first.
# Run this cell only if you are running this in a jupyter notebook
alt.renderers.enable('notebook')
# Run this cell only if you want to export your notebook as an html
alt.renderers.enable('default')
def alt_hist(df: pd.DataFrame, col_of_int: str) -> pd.DataFrame:
    """
    Count the occurrences of each distinct value of ``col_of_int``.

    Returns a two-column dataframe (``col_of_int``, ``col_count``),
    sorted by the value column with a fresh index, ready to feed into
    an Altair bar chart.  Null values are dropped by the groupby.
    """
    counts = (
        df.groupby(col_of_int, as_index=False, sort=False)
        .size()
        .rename(columns={"size": "col_count"})
    )
    return counts.sort_values(col_of_int).reset_index(drop=True)
def standard_props(chart: alt.Chart) -> alt.Chart:
    """
    Apply the report-wide default sizing and font settings to a chart.
    """
    styled = chart.properties(height=300, width=700)
    styled = styled.configure_axis(labelFontSize=15, titleFontSize=20)
    styled = styled.configure_legend(
        titleFontSize=18, labelFontSize=12, titleLimit=250, labelLimit=200
    )
    styled = styled.configure_header(titleFontSize=20, labelFontSize=15)
    return styled
def train_test_split(
    df: pd.DataFrame, train_smp: int, test_smp: int, rand_state: int = 44
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Randomly split ``df`` into a training and a testing dataframe.

    The requested sizes are capped at 80% / 20% of the available rows,
    so the combined sample can never exceed the data and the two frames
    never overlap.

    :param df: dataframe to split
    :param train_smp: desired training rows (capped at ``0.8 * len(df)``)
    :param test_smp: desired testing rows (capped at ``0.2 * len(df)``)
    :param rand_state: seed for ``DataFrame.sample`` (reproducibility)
    :return: ``(train, test)`` tuple of independent dataframes
    """
    train_smp = min(train_smp, int(df.shape[0] * 0.8))
    test_smp = min(test_smp, int(df.shape[0] * 0.2))
    total_smp = train_smp + test_smp
    df_smp = df.sample(total_smp, random_state=rand_state)
    # .copy() so downstream column assignments (e.g. attaching model
    # predictions to the test frame) do not trigger
    # SettingWithCopyWarning or silently write to a view.
    df_smp_train = df_smp.iloc[:train_smp, :].copy()
    df_smp_test = df_smp.iloc[train_smp:, :].copy()
    return (df_smp_train, df_smp_test)
# --- Data acquisition (IPython shell magics) ---
# Download the Jul-Sep 2019 Citi Bike trip archives from S3 and unzip
# them into data/raw/.
# NOTE(review): plain `mkdir` fails if the directory already exists;
# `mkdir -p` would make these cells safely re-runnable.
!mkdir data
!mkdir data/raw
!wget -P data/raw https://s3.amazonaws.com/tripdata/201909-citibike-tripdata.csv.zip
!wget -P data/raw https://s3.amazonaws.com/tripdata/201908-citibike-tripdata.csv.zip
!wget -P data/raw https://s3.amazonaws.com/tripdata/201907-citibike-tripdata.csv.zip
!unzip -q data/raw/201909-citibike-tripdata.csv.zip -d data/raw/
!unzip -q data/raw/201908-citibike-tripdata.csv.zip -d data/raw/
!unzip -q data/raw/201907-citibike-tripdata.csv.zip -d data/raw/
# Load every monthly CSV, combine into one frame, normalise the column
# names, and cache the result as feather for fast reloading.
cb_csv = [f for f in os.listdir("data/raw") if f.endswith(".csv")]
df_cb = pd.concat(
    (pd.read_csv(os.path.join("data/raw", f)) for f in cb_csv),
    ignore_index=True,
    sort=False,
)
# Spaces in the raw column headers become underscores.
df_cb.columns = [c.replace(" ", "_") for c in df_cb.columns]
df_cb.to_feather("data/citibike_data_201907-201909.feather")
df_cb = pd.read_feather("data/citibike_data_201907-201909.feather")
df_cb.shape
# Parse the timestamp columns once up front.
df_cb["starttime"] = pd.to_datetime(df_cb["starttime"], format="%Y-%m-%d %H:%M:%S.%f")
df_cb["stoptime"] = pd.to_datetime(df_cb["stoptime"], format="%Y-%m-%d %H:%M:%S.%f")
# The pandas-profiling report is expensive; re-enable if needed.
# profile = df_cb.profile_report(title="Pandas Profiling Report")
# profile.to_file(output_file="data/citibike_data_report.html")
EDA General Guidelines
# Sanity checks on the timestamp columns.
df_cb[["starttime", "stoptime", "tripduration"]].isna().sum()
# No ride should end before it starts.
(df_cb.starttime > df_cb.stoptime).sum()
# Compare the reported tripduration with the difference between the
# starttime and the stoptime.  (A stray markdown fragment was fused onto
# this line in the original export, breaking the syntax -- removed.)
df_cb["calculatedduration"] = (df_cb.stoptime - df_cb.starttime).dt.total_seconds().astype(int)
df_cb.loc[df_cb.calculatedduration != df_cb.tripduration, ["calculatedduration", "tripduration"]]
df_cb["duration_diff"] = df_cb.calculatedduration - df_cb.tripduration
df_cb.duration_diff.value_counts()
# Work in whole minutes for the histograms below (truncates seconds).
df_cb["tripduration_min"] = (df_cb.tripduration/60).astype(int)
# Histogram of trip duration (minutes) with a log-scaled count axis.
col_of_int = "tripduration_min"
col_title = "Trip Duration (Minutes)"
df_trip_dur = alt_hist(df_cb, col_of_int=col_of_int)
# Running share of all rides at or below each duration.
df_trip_dur["cum_prcnt"] = df_trip_dur.col_count.cumsum() / df_trip_dur.col_count.sum()
duration_bars = (
    alt.Chart(df_trip_dur)
    .mark_bar()
    .encode(
        x=alt.X(col_of_int, title=col_title, axis=alt.Axis(format="c")),
        y=alt.Y(
            "col_count",
            title="Count",
            axis=alt.Axis(format="s"),
            scale=alt.Scale(type="log", base=10),
        ),
        tooltip=[
            alt.Tooltip("col_count", title="Count", format="~s"),
            alt.Tooltip(col_of_int, title=col_title),
        ],
    )
)
standard_props(duration_bars).interactive()
# Embed the previously exported interactive chart for the HTML report.
IFrame("tripdur_min.html", width="100%", height=400)
# Zoom in on rides of 80 minutes or less: bars for counts plus a line
# for the cumulative percentage, drawn on independent y-axes.
col_of_int = "tripduration_min"
col_title = "Trip Duration (Minutes)"
short_trips = df_trip_dur.loc[df_trip_dur.tripduration_min <= 80]
p1 = (
    alt.Chart(short_trips)
    .mark_bar()
    .encode(
        x=alt.X(col_of_int, title=col_title, axis=alt.Axis(format="c")),
        y=alt.Y("col_count", title="Count", axis=alt.Axis(format="s")),
        tooltip=[
            alt.Tooltip("col_count", title="Count", format="~s"),
            alt.Tooltip(col_of_int, title=col_title),
        ],
    )
)
p2 = (
    alt.Chart(short_trips)
    .mark_line(color="black")
    .encode(
        x=alt.X(col_of_int, title=col_title, axis=alt.Axis(format="c")),
        y=alt.Y(
            "cum_prcnt",
            title="Cumulative Percent",
            axis=alt.Axis(format="%", grid=True),
        ),
    )
)
standard_props(p1 + p2).resolve_scale(y='independent')
# Derive calendar features from the ride start timestamp.
df_cb["start_month"] = df_cb.starttime.dt.month
df_cb["start_day"] = df_cb.starttime.dt.day
df_cb["start_date"] = pd.to_datetime(df_cb.starttime.dt.date, format="%Y-%m-%d")
df_cb["start_dow"] = df_cb.starttime.dt.day_name()
df_cb["start_hour"] = df_cb.starttime.dt.hour
col_of_int = "start_date:T"
col_title = "Start Date"
# Daily ride counts, tagged with the day of week for colour/tooltips.
df_start_date = alt_hist(df_cb, col_of_int="start_date")
df_start_date = df_start_date.merge(
    df_cb[["start_date", "start_dow"]].drop_duplicates(), on="start_date", how="left"
)
# Single vectorized boolean flag instead of initialise-then-overwrite.
df_start_date["weekend"] = df_start_date.start_dow.isin(["Saturday", "Sunday"])
# Rides per day across the three months, weekend days highlighted.
daily_chart = (
    alt.Chart(df_start_date)
    .mark_bar()
    .encode(
        x=alt.X(col_of_int, title=col_title),
        y=alt.Y("col_count", title="Count", axis=alt.Axis(format="s")),
        color=alt.Color("weekend", title="Weekend"),
        tooltip=[
            alt.Tooltip("col_count", title="Count", format="~s"),
            alt.Tooltip(col_of_int, title=col_title),
            alt.Tooltip("start_dow", title="Day of Week"),
        ],
    )
)
standard_props(daily_chart).interactive()
# Average daily ridership by day of week.
col_of_int = "start_dow"
col_title = "Day of Week"
dow_order = [
    "Sunday",
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
]
df_start_dow = (
    df_start_date[["col_count", "start_dow"]]
    .groupby("start_dow", as_index=False, sort=False)
    .mean()
)
alt.Chart(df_start_dow).mark_bar().encode(
    x=alt.X(col_of_int, title=col_title, sort=dow_order),
    y=alt.Y(
        "col_count",
        title="Average Count",
        axis=alt.Axis(format="s"),
        scale=alt.Scale(zero=False),
    ),
    tooltip=[
        alt.Tooltip("col_count", title="Average Count", format="~s"),
        alt.Tooltip(col_of_int, title=col_title),
    ],
).properties(height=300, width=800).interactive()
# Ride counts by hour of day.
col_of_int = "start_hour"
col_title = "Hour of Day"
df_start_hour = alt_hist(df_cb, col_of_int=col_of_int)
hour_chart = alt.Chart(df_start_hour).mark_bar().encode(
    x=alt.X(col_of_int, title=col_title),
    y=alt.Y(
        "col_count",
        title="Count",
        axis=alt.Axis(format="s"),
        scale=alt.Scale(zero=False),
    ),
    tooltip=[
        alt.Tooltip("col_count", title="Count", format="~s"),
        alt.Tooltip(col_of_int, title=col_title),
    ],
)
hour_chart.properties(height=300, width=800)
# Inspect the station columns for missing or malformed values.
df_cb.columns
df_cb[
    [
        "start_station_id",
        "start_station_name",
        "start_station_latitude",
        "start_station_longitude",
        "end_station_id",
        "end_station_name",
        "end_station_latitude",
        "end_station_longitude",
    ]
].isna().sum()
# Any whitespace-only station names?  Raw strings for the regexes --
# "\s" in a plain literal only works because Python passes unknown
# escapes through, and it raises a SyntaxWarning on Python 3.12+.
df_cb.loc[(df_cb.start_station_name.notna()) & (df_cb.start_station_name.str.match(r"^\s*$"))]
df_cb.loc[(df_cb.end_station_name.notna()) & (df_cb.end_station_name.str.match(r"^\s*$"))]
# Suspiciously short station names.
df_cb.loc[df_cb.start_station_name.str.len() < 10].start_station_name.unique()
df_cb.loc[df_cb.end_station_name.str.len() < 10].end_station_name.unique()
# Full set of distinct station names across both columns.
set(df_cb.start_station_name.tolist() + df_cb.end_station_name.tolist())
NYCBS Depot locations are test locations and should be removed from the dataset
# NYCBS depot stations are internal test locations; drop rides that
# start or end at one.
start_station_test = df_cb.loc[
    (df_cb.start_station_name.notna())
    & (df_cb.start_station_name.str.contains("NYCBS"))
]
print(start_station_test.shape)
df_cb = df_cb.drop(index=start_station_test.index)
end_station_test = df_cb.loc[
    (df_cb.end_station_name.notna())
    & (df_cb.end_station_name.str.contains("NYCBS"))
]
print(end_station_test.shape)
df_cb = df_cb.drop(index=end_station_test.index)
# Station names that map to more than one id / coordinate pair.
start_cols = [
    "start_station_id",
    "start_station_name",
    "start_station_latitude",
    "start_station_longitude",
]
start_st = df_cb[start_cols].drop_duplicates().dropna()
display(
    start_st.loc[start_st.start_station_name.duplicated(keep=False)].sort_values(
        "start_station_name"
    )
)
end_cols = [
    "end_station_id",
    "end_station_name",
    "end_station_latitude",
    "end_station_longitude",
]
end_st = df_cb[end_cols].drop_duplicates().dropna()
display(
    end_st.loc[end_st.end_station_name.duplicated(keep=False)].sort_values(
        "end_station_name"
    )
)
def _station_count_chart(df_counts, col_of_int, col_title):
    """Horizontal bar chart of ride counts per station, sorted descending.

    Extracted because the start- and end-station charts were identical
    copy-pasted blocks.
    """
    return alt.Chart(df_counts).mark_bar().encode(
        x=alt.X("col_count", title="Count", axis=alt.Axis(format="s")),
        y=alt.Y(
            col_of_int,
            title=col_title,
            sort=alt.EncodingSortField(field="col_count", order="descending"),
            # Too many stations for readable labels; the tooltip carries them.
            axis=alt.Axis(labels=False),
        ),
        tooltip=[
            alt.Tooltip("col_count", title="Count", format="~s"),
            alt.Tooltip(col_of_int, title=col_title),
        ],
    )

col_of_int = "start_station_name"
col_title = "Start Station Name"
df_start_stations = alt_hist(df_cb, col_of_int)
p = _station_count_chart(df_start_stations, col_of_int, col_title)
standard_props(p)
col_of_int = "end_station_name"
col_title = "End Station Name"
df_end_stations = alt_hist(df_cb, col_of_int)
p = _station_count_chart(df_end_stations, col_of_int, col_title)
standard_props(p)
# Rider demographic columns: missing values and category balance.
df_cb[["usertype", "birth_year", "gender"]].isna().sum()
df_cb.usertype.value_counts(dropna=False)
# Gender is an integer code; the note below records the assumed meaning.
df_cb.gender.value_counts(dropna=False)
I am assuming that 1 is male, 2 is female, and 0 is unknown/other
col_of_int = "birth_year"
col_title = "Birth Year"
df_birth_year = alt_hist(df_cb, col_of_int)

def _birth_year_chart(df_years):
    """Bar chart of ride counts per rider birth year.

    Extracted because the three charts below were identical except for
    the subset of years plotted.
    """
    p = (
        alt.Chart(df_years)
        .mark_bar()
        .encode(
            x=alt.X(col_of_int, title=col_title, axis=alt.Axis(format="c")),
            y=alt.Y("col_count", title="Count", axis=alt.Axis(format="s")),
            tooltip=[
                alt.Tooltip("col_count", title="Count", format="~s"),
                alt.Tooltip(col_of_int, title=col_title),
            ],
        )
    )
    return standard_props(p).interactive()

# Full distribution -- note the large spike at 1969.
_birth_year_chart(df_birth_year)
# Oldest riders: implausibly early birth years.
_birth_year_chart(df_birth_year.loc[df_birth_year.birth_year < 1940])
# Youngest riders.
_birth_year_chart(df_birth_year.loc[df_birth_year.birth_year > 1995])
I would assume that 1969 is the default birth year value, hence its spike. To prevent these values from skewing any analysis, I will replace them with nulls. There are definitely some false values in that dataset, as I doubt there are any bikers over 120 years old, but there is no obvious cutoff. However, since they are at the edge of the data, they should not influence any trends.
# Null out the (assumed) default birth year so it does not skew the
# analysis.  np.nan replaces np.NAN -- the upper-case alias was removed
# in NumPy 2.0.
df_cb.loc[df_cb.birth_year == 1969, "birth_year"] = np.nan
# NOTE(review): on pandas >= 2.0 this needs numeric_only=True if the
# frame still contains object columns -- confirm the pandas version.
df_cb_corr = df_cb.corr()
# Long format for the Altair correlation heatmap below.
df_cb_corr_melt = (
    df_cb_corr.reset_index().rename(columns={"index": "var"}).melt(id_vars=["var"])
)
# Correlation heatmap over all numeric columns.
corr_squares = (
    alt.Chart(df_cb_corr_melt)
    .mark_square()
    .encode(
        x=alt.X("var", title=None),
        y=alt.Y("variable", title=None),
        color=alt.Color("value"),
        tooltip=[
            alt.Tooltip("var", title="Variable 1"),
            alt.Tooltip("variable", title="Variable 2"),
            alt.Tooltip("value", title="Correlation", format=".2%"),
        ],
    )
)
standard_props(corr_squares)
# Geodesic trip distance, computed once per unique start/end coordinate
# pair and merged back, so the expensive geopy call runs far fewer times
# than once per ride.
coord_cols = [
    "start_station_latitude",
    "start_station_longitude",
    "end_station_longitude",
    "end_station_latitude",
]
df_latlong = df_cb[coord_cols].drop_duplicates().copy()
df_latlong.shape
df_latlong["start_lat_long"] = list(
    zip(df_latlong.start_station_latitude, df_latlong.start_station_longitude)
)
df_latlong["end_lat_long"] = list(
    zip(df_latlong.end_station_latitude, df_latlong.end_station_longitude)
)
df_latlong["trip_distance"] = df_latlong.apply(
    lambda row: distance.distance(row.start_lat_long, row.end_lat_long).miles, axis=1
)
df_cb = df_cb.merge(df_latlong, on=coord_cols, how="left")
def _duration_sample(cols):
    """5k-row sample of sub-hour rides, restricted to the given columns.

    Extracted because the three scatter cells below repeated the same
    filter + sample expression.
    """
    return df_cb.loc[df_cb.tripduration < (60 * 60)][cols].sample(
        5 * 10 ** 3, random_state=44
    )

def _duration_scatter(df_smp, x_enc, x_tip, opacity=None):
    """Scatter of trip duration (seconds) against the given x encoding."""
    mark_kwargs = {} if opacity is None else {"opacity": opacity}
    p = (
        alt.Chart(df_smp)
        .mark_circle(**mark_kwargs)
        .encode(
            x=x_enc,
            y=alt.Y(
                "tripduration", title="Trip Duration (Seconds)", axis=alt.Axis(format="s")
            ),
            tooltip=[
                alt.Tooltip("tripduration", title="Duration (Seconds)", format="s"),
                x_tip,
            ],
        )
    )
    return standard_props(p)

# Duration vs geodesic distance.
df_dist_time = _duration_sample(["trip_distance", "tripduration"])
_duration_scatter(
    df_dist_time,
    alt.X("trip_distance", title="Trip Distance (Miles)"),
    alt.Tooltip("trip_distance", title="Distance (Miles)"),
)
# Duration vs rider birth year.
df_time_birth = _duration_sample(["birth_year", "tripduration"])
_duration_scatter(
    df_time_birth,
    alt.X("birth_year", title="Birth Year", scale=alt.Scale(zero=False), axis=alt.Axis(format="d")),
    alt.Tooltip("birth_year", title="Birth Year"),
    opacity=0.3,
)
# Duration vs hour of day.
col_of_int = "start_hour"
col_title = "Hour of Day"
df_dur_start = _duration_sample([col_of_int, "tripduration"])
_duration_scatter(
    df_dur_start,
    alt.X(col_of_int, title=col_title, scale=alt.Scale(zero=False), axis=alt.Axis(format="d")),
    alt.Tooltip(col_of_int, title=col_title),
    opacity=0.3,
)
# Busiest start station and a peek at the cleaned frame.
df_cb.start_station_id.value_counts().head()
df_cb.tail()
# Rides from the busiest station (id 519), start times rounded to
# 5-minute buckets, kept as datetime.time values.
df_min = df_cb.loc[df_cb.start_station_id == 519].starttime.dt.round("5min").dt.time.to_frame()
# BUG FIX: these two inspection statements originally ran BEFORE df_min
# was defined (out-of-order notebook cells -> NameError); moved below
# the definition.
df_min.dtypes
df_min.starttime.apply(lambda x: x.strftime('%H:%M'))
# df_min.starttime = pd.to_datetime(df_min.starttime, format="%H:%M:%S")
df_min["ct"] = 1
df_min = df_min.groupby("starttime", as_index=False).count()
# Parallel datetime column for Altair's temporal axis; the original
# column becomes a display string for the tooltip.
df_min["starttime_date"] = pd.to_datetime(df_min.starttime, format="%H:%M:%S")
df_min.starttime = df_min.starttime.apply(lambda x: x.strftime('%H:%M'))
# Start-time histogram for station 519, in 5-minute buckets.
time_hist = (
    alt.Chart(df_min)
    .mark_bar()
    .encode(
        x=alt.X("starttime_date:T", title="Start Time"),
        y=alt.Y("ct", title="Count", axis=alt.Axis(format="s")),
        tooltip=[
            alt.Tooltip("ct", title="Count", format="s"),
            alt.Tooltip("starttime", title="Start Time"),
        ],
    )
)
standard_props(time_hist).interactive()
df_min
# Model training with H2O (starts a local Java-backed H2O cluster).
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
h2o.init(min_mem_size="7G")
df_cb.columns
# Regression target and requested train/test sample sizes.
outcome_y = "tripduration_min"
train_sz = 5 * 10 ** 5
test_sz =3 * 10 ** 5
# Candidate feature columns (plus the target) for the model.
df_cb_model = df_cb[
    [
        "tripduration_min",
        "starttime",
        "start_station_id",
        "start_station_name",
        "start_station_latitude",
        "start_station_longitude",
        "bikeid",
        "usertype",
        "birth_year",
        "gender",
        "start_month",
        "start_day",
        "start_date",
        "start_dow",
        "start_hour",
    ]
].copy()
# Drop extreme outliers: keep only rides shorter than 4 hours.
df_cb_model = df_cb_model.loc[df_cb_model.tripduration_min<60*4]
df_train, df_test = train_test_split(df_cb_model, train_smp=train_sz, test_smp=test_sz)
df_train.shape, df_test.shape
# Convert the pandas frames to H2O frames for training.
train_h2o = h2o.H2OFrame(df_train)
test_h2o = h2o.H2OFrame(df_test)
# (The original re-assigned the outcome column to itself on both frames
# -- a no-op, presumably left over from an asfactor()/asnumeric()
# conversion -- removed.)
linear_param = {
    "max_runtime_secs": 25,
    "nfolds": 3,
    "lambda_": 0,
    "remove_collinear_columns": True,
    "alpha": 0,
    # "family": 'binomial'
    "standardize": True,
}
x_cols = [x for x in df_cb_model.columns if x not in [outcome_y]]
# BUG FIX: linear_param was built but never passed to the estimator, so
# the model silently trained with library defaults.
h2o_glm = H2OGeneralizedLinearEstimator(**linear_param)
h2o_glm.train(x=x_cols, y=outcome_y, training_frame=train_h2o)
h2o_glm.model_performance(train_h2o)
h2o_glm.model_performance(test_h2o)
# Attach the model's predictions as a new column on the test frame.
df_test["predicted_duration"] = h2o_glm.predict(test_h2o).as_data_frame()
df_test.columns
# Predicted vs actual duration on a 5k-row sample of the test set;
# both axes are clipped to 0-40 minutes.
df_tmp = df_test.sample(5 * 10 ** 3, random_state=44)
pred_vs_actual = (
    alt.Chart(df_tmp)
    .mark_circle(clip=True)
    .encode(
        x=alt.X(
            "tripduration_min",
            title="Trip Duration (Minutes)",
            scale=alt.Scale(zero=False, domain=(0, 40)),
            axis=alt.Axis(format="d"),
        ),
        y=alt.Y(
            "predicted_duration",
            title="Predicted Duration (Minutes)",
            axis=alt.Axis(format="d"),
            scale=alt.Scale(domain=(0, 40)),
        ),
        tooltip=[
            alt.Tooltip("tripduration_min", title="Duration", format="s"),
            alt.Tooltip("predicted_duration", title="Predicted Duration"),
        ],
    )
)
standard_props(pred_vs_actual)
# Persist the test-set predictions for the report.
# NOTE(review): assumes ../downloads exists -- to_feather fails otherwise.
df_test.reset_index(drop=True).to_feather("../downloads/model_predictions.feather")